This notebook goes through the following plan:
import pandas as pd
import numpy as np
from collections import defaultdict
from matplotlib import pyplot as plt
# Load the listings dataset, keeping only the columns used in this analysis.
listing_filename = 'listings.csv'
cols = ['id', 'name', 'neighbourhood_cleansed', 'neighborhood_overview', 'latitude', 'longitude']
df = pd.read_csv(listing_filename, usecols=cols)
# Show non-null counts per column to see where values are missing.
df.count()
# From the selected columns, some rows have only their neighborhood_overview text missing.
# Since we gather overviews by neighborhood for this analysis, we do not want redundant or ungenuine overviews.
# Therefore we drop these rows with missing neighborhood_overview values.
df.dropna(inplace=True)
Preprocessing plan for the overview texts:
- split into sentences
- tokenize
- lemmatize
- discard stopwords and other noise tokens
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')
# Add a rule-based sentencizer before the parser so sentence boundaries come
# from punctuation rules rather than from the dependency parse alone.
# NOTE(review): this is the spaCy v2 API — in spaCy v3, create_pipe() was removed
# and this becomes nlp.add_pipe("sentencizer", before="parser"); confirm the
# installed spaCy version before changing it.
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer, before="parser")
# Quick smoke test of the pipeline on a two-sentence string.
doc = nlp("This is a sentence. This is another sentence.")
# Let's first create a function that decides what to exclude of the tokens of the sentence
# Let's first create a function that decides what to exclude of the tokens of the sentence
def is_excluded(token, avoidwords=()):
    '''
    Decide whether a token should be excluded from the list of lemmas.

    INPUT
    token - token object in a doc (needs is_stop, is_punct, is_space,
            like_num and lemma_ attributes)
    avoidwords - iterable of additional lemmas to exclude
                 (default changed from a mutable [] to an immutable ())
    OUTPUT
    boolean - True if the token should be excluded: it is a stopword,
              punctuation, whitespace, number-like, or its lemma appears
              in avoidwords
    '''
    # Short-circuiting or-chain instead of building a list for any();
    # the result is identical.
    return (token.is_stop
            or token.is_punct
            or token.is_space
            or token.like_num
            or token.lemma_ in avoidwords)
# Create a function that splits a neighborhood_overview text into sentences,
# discards some tokens and returns the sentences and the sentences after processing
def prepare_sentences(overview, avoidwords=()):
    '''
    This function splits a neighborhood_overview text into sentences,
    discards some tokens and returns the sentences and the sentences after processing.
    INPUT
    overview - text from neighborhood_overview (may be NaN/None)
    avoidwords - iterable of specific lemmas to exclude
                 (default changed from a mutable [] to an immutable ())
    OUTPUT
    sentences_lemmas - list of sentences of the overview text after processing
    sentences - list of the raw sentence texts, kept index-aligned with
                sentences_lemmas (the original docstring wrongly called
                this a boolean)
    '''
    # Guard clause: bail out early on missing text instead of nesting the body.
    if not pd.notnull(overview):  # or isinstance(text, str):
        print("warning: empty text")
        return [], []
    sentences = []
    sentences_lemmas = []
    # nlp is the module-level spaCy pipeline loaded above.
    for sent in nlp(overview).sents:
        lemmas = [token.lemma_ for token in sent
                  if not is_excluded(token, avoidwords)]
        sent_lemmas = ' '.join(lemmas)
        # Skip sentences that are empty after filtering so the two
        # returned lists stay index-aligned.
        if sent_lemmas:
            sentences_lemmas.append(sent_lemmas)
            sentences.append(sent.text)
    return sentences_lemmas, sentences
# test
# Expected: stopwords, punctuation, numbers and the avoided word 'Boston'
# are dropped from the lemmatized sentences; the raw sentences are returned
# unchanged alongside them.
prepare_sentences(u"""Tesla is looking at buying a U.S. startup in Boston for $6 million.
Startups are becoming juicy minutes after it.""",
avoidwords=['Boston'])
# Apply prepare_sentences() to the whole dataset
# Each sentence can be linked to the neighborhood it is about using its index in the list
def prepare_data(dt, avoidwords=()):
    '''
    This function applies prepare_sentences() to a dataframe.
    INPUT
    dt - dataframe including neighbourhood_cleansed and neighborhood_overview
    avoidwords - iterable of specific words to exclude
                 (default changed from a mutable [] to an immutable ())
    OUTPUT
    sents_lemmas - list of sentences from the neighborhood overviews after processing
    sents - list of sentences from the neighborhood overviews
    neighborhood_sentids - dictionary neighborhood --> ids of sentences in sents or
        sents_lemmas to link each sentence in sents (or sents_lemmas) to the
        neighborhood it is about
    '''
    sents = []
    sents_lemmas = []
    neighborhood_sentids = defaultdict(list)
    last_sentid = 0
    # dt.index was zipped in but never used, so iterate the two columns only.
    for overview, neighbourhood in zip(dt.neighborhood_overview, dt.neighbourhood_cleansed):
        sent_lemmas, sent = prepare_sentences(overview, avoidwords)
        sents_lemmas += sent_lemmas
        sents += sent
        # Global sentence ids are assigned in order of appearance across rows.
        neighborhood_sentids[neighbourhood] += list(range(last_sentid, last_sentid + len(sent)))
        last_sentid += len(sent)
    return sents_lemmas, sents, neighborhood_sentids
# test
# Run on the first rows only (df.loc[:1] is label-based and inclusive).
sents_lemmas, sents, neighborhood_sentids = prepare_data(df.loc[:1, ["neighborhood_overview",
"neighbourhood_cleansed"]])
print("#### sents_lemmas:", sents_lemmas, sep='\n')
print("#### sents:", sents, sep='\n')
print("#### neighborhood_sentids:", neighborhood_sentids, sep='\n')
# All distinct neighborhood names, used below as words to exclude.
neighborhoods = df["neighbourhood_cleansed"].unique()
X = df[["neighborhood_overview", "neighbourhood_cleansed"]]
# create sentences
# N.B. neighborhood names are excluded for the topics to remain unrelated to the neighborhoods
sents_lemmas, sents, neighborhood_sentids = prepare_data(X,
avoidwords=['Boston','neighborhood','jp', 'JP'] +' '.join(neighborhoods).split())
# First create a document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
# Ignore terms appearing in >95% of sentences (too common) or in <2 sentences (too rare).
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(sents_lemmas)
from sklearn.decomposition import LatentDirichletAllocation
# Number of topics to extract.
nbtopics = 5
# evaluate_every=2 evaluates perplexity every 2 iterations so fitting can
# stop before max_iter once it converges.
LDA = LatentDirichletAllocation(n_components=nbtopics, random_state=42,
max_iter=80, evaluate_every=2)
# This can take awhile, we're dealing with a large amount of documents!
LDA.fit(dtm)
# Iterations actually run and the final perplexity on the training matrix.
LDA.n_iter_, LDA.perplexity(dtm)
# Showing Top Words Per Topic:
# NOTE(review): scikit-learn >= 1.2 removed get_feature_names() in favor of
# get_feature_names_out() — confirm against the installed version.
feature_names = cv.get_feature_names()  # hoisted: invariant across topics
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index+1}')
    # argsort is ascending; take the 20 largest weights, then reverse to descending.
    print([feature_names[i] for i in topic.argsort()[-20:]][::-1])
    print('\n')
# For each neighborhood: How much proportion of the overviews are about topic X ?
# topic_results[i, j] = proportion of topic j in sentence i.
topic_results = LDA.transform(dtm)
# Dominant topic index per sentence.
topic_asgn = topic_results.argmax(axis=1)
# Build the result table column by column; dict insertion order fixes the
# column order of the dataframe below.
d = {**{"neighborhood": list(neighborhood_sentids.keys())}, **{f"topic {i+1}":[] for i in range(nbtopics)}}
d["sentences_count"] = []
d["dominant_topic"] = []
for nghbd, sentids in neighborhood_sentids.items():
    # Average topic distribution over all sentences of this neighborhood.
    c = np.average(topic_results[sentids], axis=0)
    for i in range(nbtopics):
        d[f"topic {i+1}"].append(c[i])
    d["sentences_count"].append(len(sentids))
    d["dominant_topic"].append(np.argmax(c))
# Assemble into a dataframe, largest neighborhoods (by sentence count) first.
H = pd.DataFrame(d).sort_values(by="sentences_count", ascending=False).round(3)
H.set_index("neighborhood", inplace=True)
# Color-grade the topic-proportion columns for easier visual comparison.
H.style.background_gradient(axis=0, subset=list(f"topic {i+1}" for i in range(nbtopics)))
First, we create functions to get the top neighborhoods per topic and to visualize them.
def get_top_neighborhoods(H, topic, max_rank=3):
    """
    Gets top neighborhoods by the proportion of overviews dedicated to a topic.
    INPUTS:
    H - dataframe - dataframe of topic proportions by neighborhood
        (indexed by neighborhood name)
    topic - string - topic of choice e.g. "topic 2"
    max_rank - integer - length of the desired ranking (e.g. max_rank=3 to get top 3 neighborhoods)
    OUTPUT:
    top_neighborhoods - list of the top max_rank neighborhood names
    """
    # nlargest replaces the previous sort-then-slice and the dead
    # commented-out variants; the result is the same ranking.
    return list(H[topic].nlargest(max_rank).index)
# Test
# Top 3 neighborhoods by proportion of overview sentences about topic 1.
get_top_neighborhoods(H, "topic 1", max_rank=3)
# Visualize a specific topic importance with a map of
# listings colored according to the topic proportion in their neighborhood
import plotly.express as px
from IPython.display import Image
def map_topic(df, H, topic):
    """
    Creates a map of listings colored according to the topic proportion in their neighborhood.

    NOTE: adds a column named after `topic` to `df` in place (side effect on
    the caller's dataframe, kept from the original implementation).

    INPUTS:
    df - dataframe - dataset of listings (needs neighbourhood_cleansed,
        latitude and longitude columns)
    H - dataframe - dataframe of topic proportions indexed by neighborhood
    topic - string - topic of choice e.g. "topic 2"
    OUTPUT:
    fig - plotly Figure with the scatter map (the original docstring wrongly
        said None and documented a nonexistent max_rank parameter)
    """
    # Series.map aligns on H's neighborhood index — same values as the
    # previous .apply(lambda n: H.loc[n, topic]) for known neighborhoods.
    df[topic] = df["neighbourhood_cleansed"].map(H[topic])
    fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", color=topic,
                            hover_name="neighbourhood_cleansed", zoom=11,
                            # center of the mapped area
                            center={'lat':42.32,'lon':-71.08}, width=700, height=700,
                            )
    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    return fig
# Render the top-neighborhood ranking and the listings map for every topic.
# This one loop replaces five copy-pasted cells that differed only in the
# topic string; per-topic behavior is unchanged.
from IPython.display import display

max_rank = 3
for i in range(nbtopics):
    topic = f"topic {i+1}"
    top_neighborhoods = get_top_neighborhoods(H, topic, max_rank=max_rank)
    print(f"The top {max_rank} neighborhoods for {topic}:", ', '.join(top_neighborhoods))
    fig = map_topic(df, H, topic)
    # Render as a static PNG; display() is needed inside a loop (a bare
    # Image(...) expression only renders as a cell's last expression).
    img_bytes = fig.to_image(format="png")
    display(Image(img_bytes))